Introduction

Julkunen, H., Cichońska, A., Tiainen, M. et al. Atlas of plasma NMR biomarkers for health and disease in 118,461 individuals from the UK Biobank. Nat Commun 14, 604 (2023). https://doi.org/10.1038/s41467-023-36231-7

Preparations

Load Data

data <- 
  readr::read_csv(
    file = "C:/Users/tsuv0001/Documents/data-nonsensitive-local-copies/ukb_nightingale_biomarker_disease_association_atlas.csv" # Path to the data file downloaded from https://biomarker-atlas.nightingale.cloud/ .
  )
## Rows: 1400595 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): age_group, biomarker_id, biomarker_name, group_name, endpoint_type...
## dbl  (5): estimate, se, pvalue, n, n_events
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

List Measured Biomarkers

names.biomarkers <- sort( names( table( data$"biomarker_name" ) ) )

names.biomarkers
##   [1] "3-Hydroxybutyrate"    "Acetate"              "Acetoacetate"        
##   [4] "Acetone"              "Alanine"              "Albumin"             
##   [7] "ApoA1"                "ApoB"                 "ApoB/ApoA1"          
##  [10] "Citrate"              "Clinical LDL-C"       "Creatinine"          
##  [13] "DHA"                  "DHA %"                "Glucose"             
##  [16] "Glutamine"            "Glycine"              "Glycoprotein acetyls"
##  [19] "HDL-C"                "HDL-CE"               "HDL-FC"              
##  [22] "HDL-L"                "HDL-P"                "HDL-PL"              
##  [25] "HDL-TG"               "HDL particle size"    "Histidine"           
##  [28] "IDL-C"                "IDL-C %"              "IDL-CE"              
##  [31] "IDL-CE %"             "IDL-FC"               "IDL-FC %"            
##  [34] "IDL-L"                "IDL-P"                "IDL-PL"              
##  [37] "IDL-PL %"             "IDL-TG"               "IDL-TG %"            
##  [40] "Isoleucine"           "L-HDL-C"              "L-HDL-C %"           
##  [43] "L-HDL-CE"             "L-HDL-CE %"           "L-HDL-FC"            
##  [46] "L-HDL-FC %"           "L-HDL-L"              "L-HDL-P"             
##  [49] "L-HDL-PL"             "L-HDL-PL %"           "L-HDL-TG"            
##  [52] "L-HDL-TG %"           "L-LDL-C"              "L-LDL-C %"           
##  [55] "L-LDL-CE"             "L-LDL-CE %"           "L-LDL-FC"            
##  [58] "L-LDL-FC %"           "L-LDL-L"              "L-LDL-P"             
##  [61] "L-LDL-PL"             "L-LDL-PL %"           "L-LDL-TG"            
##  [64] "L-LDL-TG %"           "L-VLDL-C"             "L-VLDL-C %"          
##  [67] "L-VLDL-CE"            "L-VLDL-CE %"          "L-VLDL-FC"           
##  [70] "L-VLDL-FC %"          "L-VLDL-L"             "L-VLDL-P"            
##  [73] "L-VLDL-PL"            "L-VLDL-PL %"          "L-VLDL-TG"           
##  [76] "L-VLDL-TG %"          "LA"                   "LA %"                
##  [79] "Lactate"              "LDL-C"                "LDL-CE"              
##  [82] "LDL-FC"               "LDL-L"                "LDL-P"               
##  [85] "LDL-PL"               "LDL-TG"               "LDL particle size"   
##  [88] "Leucine"              "M-HDL-C"              "M-HDL-C %"           
##  [91] "M-HDL-CE"             "M-HDL-CE %"           "M-HDL-FC"            
##  [94] "M-HDL-FC %"           "M-HDL-L"              "M-HDL-P"             
##  [97] "M-HDL-PL"             "M-HDL-PL %"           "M-HDL-TG"            
## [100] "M-HDL-TG %"           "M-LDL-C"              "M-LDL-C %"           
## [103] "M-LDL-CE"             "M-LDL-CE %"           "M-LDL-FC"            
## [106] "M-LDL-FC %"           "M-LDL-L"              "M-LDL-P"             
## [109] "M-LDL-PL"             "M-LDL-PL %"           "M-LDL-TG"            
## [112] "M-LDL-TG %"           "M-VLDL-C"             "M-VLDL-C %"          
## [115] "M-VLDL-CE"            "M-VLDL-CE %"          "M-VLDL-FC"           
## [118] "M-VLDL-FC %"          "M-VLDL-L"             "M-VLDL-P"            
## [121] "M-VLDL-PL"            "M-VLDL-PL %"          "M-VLDL-TG"           
## [124] "M-VLDL-TG %"          "MUFA"                 "MUFA %"              
## [127] "non-HDL-C"            "Omega-3"              "Omega-3 %"           
## [130] "Omega-6"              "Omega-6 %"            "Omega-6/Omega-3"     
## [133] "Phenylalanine"        "Phosphatidylcholines" "Phosphoglycerides"   
## [136] "PUFA"                 "PUFA %"               "PUFA/MUFA"           
## [139] "Pyruvate"             "Remnant-C"            "S-HDL-C"             
## [142] "S-HDL-C %"            "S-HDL-CE"             "S-HDL-CE %"          
## [145] "S-HDL-FC"             "S-HDL-FC %"           "S-HDL-L"             
## [148] "S-HDL-P"              "S-HDL-PL"             "S-HDL-PL %"          
## [151] "S-HDL-TG"             "S-HDL-TG %"           "S-LDL-C"             
## [154] "S-LDL-C %"            "S-LDL-CE"             "S-LDL-CE %"          
## [157] "S-LDL-FC"             "S-LDL-FC %"           "S-LDL-L"             
## [160] "S-LDL-P"              "S-LDL-PL"             "S-LDL-PL %"          
## [163] "S-LDL-TG"             "S-LDL-TG %"           "S-VLDL-C"            
## [166] "S-VLDL-C %"           "S-VLDL-CE"            "S-VLDL-CE %"         
## [169] "S-VLDL-FC"            "S-VLDL-FC %"          "S-VLDL-L"            
## [172] "S-VLDL-P"             "S-VLDL-PL"            "S-VLDL-PL %"         
## [175] "S-VLDL-TG"            "S-VLDL-TG %"          "SFA"                 
## [178] "SFA %"                "Sphingomyelins"       "TG/PG"               
## [181] "Total-C"              "Total-CE"             "Total-FC"            
## [184] "Total-L"              "Total-P"              "Total-PL"            
## [187] "Total BCAA"           "Total cholines"       "Total fatty acids"   
## [190] "Total triglycerides"  "Tyrosine"             "Unsaturation"        
## [193] "Valine"               "VLDL-C"               "VLDL-CE"             
## [196] "VLDL-FC"              "VLDL-L"               "VLDL-P"              
## [199] "VLDL-PL"              "VLDL-TG"              "VLDL particle size"  
## [202] "XL-HDL-C"             "XL-HDL-C %"           "XL-HDL-CE"           
## [205] "XL-HDL-CE %"          "XL-HDL-FC"            "XL-HDL-FC %"         
## [208] "XL-HDL-L"             "XL-HDL-P"             "XL-HDL-PL"           
## [211] "XL-HDL-PL %"          "XL-HDL-TG"            "XL-HDL-TG %"         
## [214] "XL-VLDL-C"            "XL-VLDL-C %"          "XL-VLDL-CE"          
## [217] "XL-VLDL-CE %"         "XL-VLDL-FC"           "XL-VLDL-FC %"        
## [220] "XL-VLDL-L"            "XL-VLDL-P"            "XL-VLDL-PL"          
## [223] "XL-VLDL-PL %"         "XL-VLDL-TG"           "XL-VLDL-TG %"        
## [226] "XS-VLDL-C"            "XS-VLDL-C %"          "XS-VLDL-CE"          
## [229] "XS-VLDL-CE %"         "XS-VLDL-FC"           "XS-VLDL-FC %"        
## [232] "XS-VLDL-L"            "XS-VLDL-P"            "XS-VLDL-PL"          
## [235] "XS-VLDL-PL %"         "XS-VLDL-TG"           "XS-VLDL-TG %"        
## [238] "XXL-VLDL-C"           "XXL-VLDL-C %"         "XXL-VLDL-CE"         
## [241] "XXL-VLDL-CE %"        "XXL-VLDL-FC"          "XXL-VLDL-FC %"       
## [244] "XXL-VLDL-L"           "XXL-VLDL-P"           "XXL-VLDL-PL"         
## [247] "XXL-VLDL-PL %"        "XXL-VLDL-TG"          "XXL-VLDL-TG %"

Omit Lipoprotein Particle Descriptors

names.biomarkers.printed <- names.biomarkers

names.biomarkers.printed <-
  names.biomarkers.printed[ 
    !grepl(
      x = names.biomarkers.printed,
      pattern = "DL"
    )
  ]

names.biomarkers.printed
##  [1] "3-Hydroxybutyrate"    "Acetate"              "Acetoacetate"        
##  [4] "Acetone"              "Alanine"              "Albumin"             
##  [7] "ApoA1"                "ApoB"                 "ApoB/ApoA1"          
## [10] "Citrate"              "Creatinine"           "DHA"                 
## [13] "DHA %"                "Glucose"              "Glutamine"           
## [16] "Glycine"              "Glycoprotein acetyls" "Histidine"           
## [19] "Isoleucine"           "LA"                   "LA %"                
## [22] "Lactate"              "Leucine"              "MUFA"                
## [25] "MUFA %"               "Omega-3"              "Omega-3 %"           
## [28] "Omega-6"              "Omega-6 %"            "Omega-6/Omega-3"     
## [31] "Phenylalanine"        "Phosphatidylcholines" "Phosphoglycerides"   
## [34] "PUFA"                 "PUFA %"               "PUFA/MUFA"           
## [37] "Pyruvate"             "Remnant-C"            "SFA"                 
## [40] "SFA %"                "Sphingomyelins"       "TG/PG"               
## [43] "Total-C"              "Total-CE"             "Total-FC"            
## [46] "Total-L"              "Total-P"              "Total-PL"            
## [49] "Total BCAA"           "Total cholines"       "Total fatty acids"   
## [52] "Total triglycerides"  "Tyrosine"             "Unsaturation"        
## [55] "Valine"

Select Population and Endpoint

table( data$"age_group" )
## 
##  1st tertile (39-53, statin use 6%) 2nd tertile (54-61, statin use 17%) 
##                              352522                              346890 
## 3rd tertile (62-71, statin use 30%)                     Full population 
##                              343869                              357314
table( data$"endpoint_type" )
## 
##  incident mortality prevalent 
##    708025     72697    619873
data.plot <- data

data.plot <- data.plot[ data.plot$"age_group" == "Full population", ]

data.plot <- data.plot[ data.plot$"endpoint_type" == "incident", ]

data.plot <- 
  data.plot[ data.plot$"biomarker_name" %in% names.biomarkers.printed, ]

Principal Component Analysis (PCA)

Preparations

Create a Wide Data Matrix

  • Biomarkers as columns
  • Conditions as rows
  • Additional columns on condition name and disease group for use in the plot
data.plot.pca <-
  tidyr::pivot_wider(
    data = data.plot,
    id_cols = icd10_desc,
    names_from = biomarker_name,
    # values_from = estimate.significant
    values_from = estimate
  )

tmp <- data.plot.pca$"icd10_desc"

# Extract disease group from disease code (condition).
# Extract most increased and most decreased metabolites for each condition.

data.plot.pca <- 
  data.frame( 
    Condition_Name = unlist( data.plot.pca[ , 1 ] ),
    Disease_Group =
      stringr::str_sub(
        string = unlist( data.plot.pca[ , 1 ] ),
        start = 1,
        end = 1
      ),
    Top_Increase =
      colnames( data.plot.pca )[
        apply(
          X = data.plot.pca[ , -1 ],
          MAR = 1,
          FUN = which.max
        ) + 1
      ],
    Top_Decrease =
      colnames( data.plot.pca )[
        apply(
          X = data.plot.pca[ , -1 ],
          MAR = 1,
          FUN = which.min
        ) + 1
      ],
    Condition = NA,
    data.plot.pca[ , -1 ]
    )

# Create a text for tooltip.

data.plot.pca$"Condition" <-
  paste0(
    data.plot.pca$"Condition_Name",
    "\n\nMost Increased: ",
    data.plot.pca$"Top_Increase",
    "\nMost Decreased: ",
    data.plot.pca$"Top_Decrease"
  )

# Set disease codes as rownames.

rownames( data.plot.pca ) <- tmp

Omit Rows with No Values

  • Disease codes that have missing data on any of the biomarkers.
has.any.missing <-
  apply(
    X = is.na( data.plot.pca[ , -( 1:5 ) ] ),
    MAR = 1,
    FUN = any
  )

table( has.any.missing )
## has.any.missing
## FALSE  TRUE 
##   701    13
data.plot.pca <- data.plot.pca[ !has.any.missing, ]

Compute PCA

  • Omitting the metadata columns.
result.pca <- prcomp( data.plot.pca[ , -( 1:5 ) ] )

Show Non-Interactive PCA

  • Conditions as points (scores)
  • Metabolites as loadings
  • Color by disease group
  • Further information about the condition in tooltip (via shape; later on used by plotly::ggplotly)
library( "ggfortify" )
## Loading required package: ggplot2
plot.pca <- 
  autoplot(
    result.pca,
    data = data.plot.pca,
    alpha = 0.5,
    loadings.colour = "black",
    loadings.label = TRUE,
    loadings.label.colour = "black",
    shape = "Condition",
    colour = "Disease_Group"
  ) +
  ggplot2::theme( legend.position = "none" )
print( plot.pca )
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 701. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 695 rows containing missing values (geom_point).

Create the Interactive PCA Figure

plot.pca.interactive <-
  plotly::ggplotly( 
    p = plot.pca,
    tooltip = "shape"
  )
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 701. Consider
## specifying shapes manually if you must have them.

Show the Interactive PCA Figure

plot.pca.interactive

Heatmap

Preparations

Show Non-Significant Values as Zero

data.plot$"Association" <- data.plot$"estimate"

data.plot[ data.plot$"pvalue" > 0.01, "Association" ] <- 0

Create a Wide Data Matrix

data.plot.wide <-
  tidyr::pivot_wider(
    data = data.plot,
    id_cols = icd10_desc,
    names_from = biomarker_name,
    values_from = Association
  )

tmp <- data.plot.wide$"icd10_desc"

data.plot.wide <- as.matrix( data.plot.wide[ , -1 ] )

rownames( data.plot.wide ) <- tmp

Fill in Missing Disease-Metabolite Values as Zero

data.plot.wide[ is.na( data.plot.wide ) ] <- 0

Omit Conditions with No Associations

is.all.zeros <- 
  apply(
    X = data.plot.wide == 0,
    MAR = 1,
    FUN = all
  )

table( is.all.zeros )
## is.all.zeros
## FALSE  TRUE 
##   607   107
conditions.all.zeros <- rownames( data.plot.wide )[ is.all.zeros ]

data.plot.wide <- data.plot.wide[ !is.all.zeros, ]

Create Non-Interactive Heatmap

result.heatmap <-
  gplots::heatmap.2(
    x = data.plot.wide,
    trace = "none",
    tracecol = "black",
    col = gplots::bluered( n = 99 ),
    margins = c( 15, 15 ),
    srtCol = 45,
    cexCol = 2
  )

Define the Row and Column Order in Long Data

  • Use the order from hierarchical clustering of the non-interactive heatmap.
  • The should be possible to specify in the call to plotly::layout, but for some reason it does not work for the row order.
  • Instead, add a running three-digit number as suffix in the condition name to enforce the desired row order based on alphabetical order.
data.plot.heatmap <- data.plot

data.plot.heatmap <- 
  data.plot.heatmap[ 
    !( data.plot.heatmap$"icd10_desc" %in% conditions.all.zeros ),
  ]

data.plot.heatmap$"biomarker_name" <-
  factor(
    x = data.plot.heatmap$"biomarker_name",
    levels = colnames( data.plot.wide )[ result.heatmap$"colInd" ]
  )

data.plot.heatmap$"icd10_desc" <-
  factor(
    x = data.plot.heatmap$"icd10_desc",
    levels = rownames( data.plot.wide )[ result.heatmap$"rowInd" ],
    labels = 
      paste( 
        formatC(
          x = 1:nrow( data.plot.wide ),
          flag = "0",
          width = 3
        ),
        rownames( data.plot.wide )[ result.heatmap$"rowInd" ]
      )
  )

Define Tooltip

  • Show the p-value of the association between the condition and metabolite.
data.plot.heatmap$"annotation" <-
  paste0(
    "p: ",
    data.plot.heatmap$"pvalue"
  )

Set Colorscale Midpoint to White and Zero

val.min <- round( min( data.plot.heatmap$"Association" ) * 100 )

val.max <- round( max( data.plot.heatmap$"Association" ) * 100 )

range <- max( abs( val.max ), abs( val.min ) ) * 2

palette <- gplots::bluered( n = range )

palette <- palette[ round( range / 2 + val.min ) : round( range / 2 + val.max ) ]

Show Non-Significant Values as Missing Data

data.plot.heatmap$"Association"[ data.plot.heatmap$"pvalue" > 0.01 ] <- NA

Cut Condition Names Shorter

data.plot.heatmap$"icd10_desc" <-
  stringr::str_sub(
    string = data.plot.heatmap$"icd10_desc",
    start = 1,
    end = 30
  )

Create the Interactive Heatmap

# Create the heatmap.

plot.heatmap.interactive <-
  plotly::plot_ly(
    data = data.plot.heatmap,
    x = ~biomarker_name,
    y = ~icd10_desc,
    z = ~Association,
    text = ~annotation,
    colors = gplots::bluered( n = 99 ),
    type = "heatmap"
  )

# Define the colorscale as symmetric.

plot.heatmap.interactive <-
plotly::colorbar(
  p = plot.heatmap.interactive,
    limits = 
      c( -1, 1 ) * max( x = abs( data.plot.heatmap$"Association" ), na.rm = TRUE )
  )

# Omit axis titles, adjust text size and omit background grid.

plot.heatmap.interactive <-
  plotly::layout(
    p = plot.heatmap.interactive,
    xaxis = 
      list(
        # categoryorder = "array",
        # categoryarray = colnames( data.plot.wide )[ result.heatmap$"colInd"],
        showgrid = FALSE,
        tickfont = list( size = 7 ),
        title = ""
      ),
    yaxis = 
      list(
        # categoryorder = "array",
        # categoryarray = rownames( data.plot.wide )[ result.heatmap$"rowInd" ],
        showgrid = FALSE,
        tickfont = list( size = 7 ),
        title = ""
      )
  )

Show the Interactive Heatmap

plot.heatmap.interactive

SessionInfo

utils::sessionInfo()
## R version 4.2.0 (2022-04-22 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Europe.utf8  LC_CTYPE=English_Europe.utf8   
## [3] LC_MONETARY=English_Europe.utf8 LC_NUMERIC=C                   
## [5] LC_TIME=English_Europe.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggfortify_0.4.14 ggplot2_3.3.6   
## 
## loaded via a namespace (and not attached):
##  [1] gtools_3.9.2.1     tidyselect_1.1.2   xfun_0.31          bslib_0.3.1       
##  [5] purrr_0.3.4        colorspace_2.0-3   vctrs_0.4.1        generics_0.1.2    
##  [9] viridisLite_0.4.0  htmltools_0.5.2    yaml_2.3.5         utf8_1.2.2        
## [13] plotly_4.10.0      rlang_1.0.2        jquerylib_0.1.4    pillar_1.7.0      
## [17] glue_1.6.2         withr_2.5.0        bit64_4.0.5        lifecycle_1.0.1   
## [21] stringr_1.4.0      munsell_0.5.0      gtable_0.3.0       caTools_1.18.2    
## [25] htmlwidgets_1.5.4  evaluate_0.15      labeling_0.4.2     knitr_1.39        
## [29] tzdb_0.3.0         fastmap_1.1.0      crosstalk_1.2.0    parallel_4.2.0    
## [33] fansi_1.0.3        highr_0.9          KernSmooth_2.23-20 readr_2.1.2       
## [37] scales_1.2.0       vroom_1.5.7        jsonlite_1.8.0     farver_2.1.0      
## [41] bit_4.0.4          gplots_3.1.3       gridExtra_2.3      hms_1.1.1         
## [45] digest_0.6.29      stringi_1.7.6      dplyr_1.0.9        grid_4.2.0        
## [49] bitops_1.0-7       cli_3.3.0          tools_4.2.0        magrittr_2.0.3    
## [53] sass_0.4.1         lazyeval_0.2.2     tibble_3.1.7       crayon_1.5.1      
## [57] tidyr_1.2.0        pkgconfig_2.0.3    ellipsis_0.3.2     data.table_1.14.2 
## [61] rmarkdown_2.14     httr_1.4.3         rstudioapi_0.13    R6_2.5.1          
## [65] compiler_4.2.0

Appendix

if ( file.exists( "README.html" ) ) {
  
  file.copy( 
    from = "README.html",
    to = "index.html",
    overwrite = TRUE
  )
  
}
## [1] TRUE